rawdata <- read_csv('data/nycschooldata.csv')
## Rows: 1272 Columns: 161
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (36): Adjusted Grade, New?, Other Location Code in LCGMS, School Name, ...
## dbl (125): SED Code, District, Latitude, Longitude, Zip, Grade 3 ELA - All S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
school_df <- rawdata %>%
clean_names() %>%
select(4:41) %>%
na_if('N/A') %>%
mutate(
#community_school = ifelse(community_school == 'Yes', 1, 0),
economic_need_index = as.numeric(economic_need_index),
school_income_estimate = as.numeric(
gsub('[$,]', '', school_income_estimate)
),
percent_ell = as.numeric(gsub('[%]', '', percent_ell)) / 100,
percent_asian = as.numeric(gsub('[%]', '', percent_asian)) / 100,
percent_black = as.numeric(gsub('[%]', '', percent_black)) / 100,
percent_hispanic = as.numeric(gsub('[%]', '', percent_hispanic)) / 100,
percent_blhi = as.numeric(gsub('[%]', '', percent_black_hispanic)) / 100,
percent_white = as.numeric(gsub('[%]', '', percent_white)) / 100,
attendance_rate = as.numeric(
gsub('[%]', '', student_attendance_rate)
) / 100,
chronically_absent = as.numeric(
gsub('[%]', '', percent_of_students_chronically_absent)
) / 100,
rigorous_instruction = as.numeric(
gsub('[%]', '', rigorous_instruction_percent)
) / 100,
collab_teachers = as.numeric(
gsub('[%]', '', collaborative_teachers_percent)
) / 100,
supportive_env = as.numeric(
gsub('[%]', '', supportive_environment_percent)
) / 100,
leadership = as.numeric(
gsub('[%]', '', effective_school_leadership_percent)
) / 100,
fam_com_ties = as.numeric(
gsub('[%]', '', strong_family_community_ties_percent)
) / 100,
trust = as.numeric(gsub('[%]', '', trust_percent)) / 100,
ela_proficiency = as.numeric(average_ela_proficiency),
math_proficiency = as.numeric(average_math_proficiency),
avg_proficiency = (ela_proficiency + math_proficiency) / 2
) %>%
select(
-sed_code, -percent_black_hispanic, -student_attendance_rate,
-percent_of_students_chronically_absent, -collaborative_teachers_percent,
-rigorous_instruction_percent, -supportive_environment_percent,
-effective_school_leadership_percent, -strong_family_community_ties_percent,
-trust_percent, -average_ela_proficiency, -average_math_proficiency,
-grades, -grade_low, -grade_high, -ends_with('rating')
)
loc_df <- school_df %>%
select(school_name, location_code, district, latitude,
longitude, address_full, city, zip)
cor(select_if(school_df, is.numeric), use="pairwise.complete.obs") %>%
corrplot(method = 'shade', type = 'lower', diag = FALSE)
eda_p1 <- ggplot(school_df,
aes(x = ela_proficiency, y = math_proficiency,
color = school_income_estimate)) +
geom_point()
ggplot(school_df, aes(x = school_income_estimate)) +
geom_histogram(bins=35)
## Warning: Removed 396 rows containing non-finite values (`stat_bin()`).
unique(school_df$city)
## [1] "NEW YORK" "ROOSEVELT ISLAND" "BRONX"
## [4] "BROOKLYN" "ELMHURST" "WOODSIDE"
## [7] "CORONA" "MIDDLE VILLAGE" "MASPETH"
## [10] "RIDGEWOOD" "GLENDALE" "LONG ISLAND CITY"
## [13] "FLUSHING" "COLLEGE POINT" "WHITESTONE"
## [16] "BAYSIDE" "QUEENS VILLAGE" "LITTLE NECK"
## [19] "DOUGLASTON" "FLORAL PARK" "BELLEROSE"
## [22] "JAMAICA" "ARVERNE" "FAR ROCKAWAY"
## [25] "SOUTH OZONE PARK" "BROAD CHANNEL" "RICHMOND HILL"
## [28] "WOODHAVEN" "SOUTH RICHMOND HILL" "OZONE PARK"
## [31] "ROCKAWAY PARK" "HOWARD BEACH" "ROCKAWAY BEACH"
## [34] "KEW GARDENS" "FOREST HILLS" "REGO PARK"
## [37] "SPRINGFIELD GARDENS" "HOLLIS" "SAINT ALBANS"
## [40] "ROSEDALE" "CAMBRIA HEIGHTS" "JACKSON HEIGHTS"
## [43] "ASTORIA" "EAST ELMHURST" "STATEN ISLAND"
r <- GET('http://data.beta.nyc//dataset/0ff93d2d-90ba-457c-9f7e-39e47bf2ac5f/resource/35dd04fb-81b3-479b-a074-a27a37888ce7/download/d085e2f8d0b54d4590b1e7d1f35594c1pediacitiesnycneighborhoods.geojson')
nycmap <- rgdal::readOGR(content(r, 'text'), 'OGRGeoJSON', verbose=F)
## Warning: OGR support is provided by the sf and terra packages among others
## No encoding supplied: defaulting to UTF-8.
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
## Warning: OGR support is provided by the sf and terra packages among others
nycmaps_df <- tidy(nycmap)
## Regions defined for each Polygons
ggplot() +
geom_polygon(data=nycmaps_df, aes(x=long, y=lat, group=group),
color='white', fill='lightgrey') +
geom_point(data=school_df,
mapping=aes(x = longitude, y = latitude, color=city)) +
labs(title='Schools in the New York City Area by City') +
theme(panel.background = element_blank(), legend.position = 'none',
axis.title.x = element_blank(), axis.title.y = element_blank(),
axis.text.x = element_blank(), axis.text.y = element_blank(),
axis.ticks.x = element_blank(), axis.ticks.y = element_blank())
school_df
## # A tibble: 1,272 × 28
## school_…¹ locat…² distr…³ latit…⁴ longi…⁵ addre…⁶ city zip commu…⁷ econo…⁸
## <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl>
## 1 P.S. 015… 01M015 1 40.7 -74.0 333 E … NEW … 10009 Yes 0.919
## 2 P.S. 019… 01M019 1 40.7 -74.0 185 1S… NEW … 10003 No 0.641
## 3 P.S. 020… 01M020 1 40.7 -74.0 166 ES… NEW … 10002 No 0.744
## 4 P.S. 034… 01M034 1 40.7 -74.0 730 E … NEW … 10009 No 0.86
## 5 THE STAR… 01M063 1 40.7 -74.0 121 E … NEW … 10009 No 0.73
## 6 P.S. 064… 01M064 1 40.7 -74.0 600 E … NEW … 10009 No 0.858
## 7 P.S. 110… 01M110 1 40.7 -74.0 285 DE… NEW … 10002 No 0.499
## 8 P.S. 134… 01M134 1 40.7 -74.0 293 E … NEW … 10002 No 0.833
## 9 P.S. 140… 01M140 1 40.7 -74.0 123 RI… NEW … 10002 No 0.849
## 10 P.S. 142… 01M142 1 40.7 -74.0 100 AT… NEW … 10002 No 0.861
## # … with 1,262 more rows, 18 more variables: school_income_estimate <dbl>,
## # percent_ell <dbl>, percent_asian <dbl>, percent_black <dbl>,
## # percent_hispanic <dbl>, percent_white <dbl>, percent_blhi <dbl>,
## # attendance_rate <dbl>, chronically_absent <dbl>,
## # rigorous_instruction <dbl>, collab_teachers <dbl>, supportive_env <dbl>,
## # leadership <dbl>, fam_com_ties <dbl>, trust <dbl>, ela_proficiency <dbl>,
## # math_proficiency <dbl>, avg_proficiency <dbl>, and abbreviated variable …
ggplot(school_df, aes(community_school, avg_proficiency)) +
geom_boxplot()
## Warning: Removed 55 rows containing non-finite values (`stat_boxplot()`).
# MAJOR INDICATOR
unique(school_df$city)
## [1] "NEW YORK" "ROOSEVELT ISLAND" "BRONX"
## [4] "BROOKLYN" "ELMHURST" "WOODSIDE"
## [7] "CORONA" "MIDDLE VILLAGE" "MASPETH"
## [10] "RIDGEWOOD" "GLENDALE" "LONG ISLAND CITY"
## [13] "FLUSHING" "COLLEGE POINT" "WHITESTONE"
## [16] "BAYSIDE" "QUEENS VILLAGE" "LITTLE NECK"
## [19] "DOUGLASTON" "FLORAL PARK" "BELLEROSE"
## [22] "JAMAICA" "ARVERNE" "FAR ROCKAWAY"
## [25] "SOUTH OZONE PARK" "BROAD CHANNEL" "RICHMOND HILL"
## [28] "WOODHAVEN" "SOUTH RICHMOND HILL" "OZONE PARK"
## [31] "ROCKAWAY PARK" "HOWARD BEACH" "ROCKAWAY BEACH"
## [34] "KEW GARDENS" "FOREST HILLS" "REGO PARK"
## [37] "SPRINGFIELD GARDENS" "HOLLIS" "SAINT ALBANS"
## [40] "ROSEDALE" "CAMBRIA HEIGHTS" "JACKSON HEIGHTS"
## [43] "ASTORIA" "EAST ELMHURST" "STATEN ISLAND"
ggplot(school_df,
aes(x = avg_proficiency, y = economic_need_index,
color = school_income_estimate)
) +
geom_point() + scale_y_continuous()
## Warning: Removed 55 rows containing missing values (`geom_point()`).
count(school_df, city) # ASSUMING MANHATTAN = NEW YORK
## # A tibble: 45 × 2
## city n
## <chr> <int>
## 1 ARVERNE 2
## 2 ASTORIA 6
## 3 BAYSIDE 13
## 4 BELLEROSE 4
## 5 BROAD CHANNEL 1
## 6 BRONX 297
## 7 BROOKLYN 411
## 8 CAMBRIA HEIGHTS 2
## 9 COLLEGE POINT 2
## 10 CORONA 9
## # … with 35 more rows
ggplot(filter(school_df, city==c('NEW YORK', 'BRONX')), aes(avg_proficiency, latitude)) + geom_point()
## Warning: Removed 13 rows containing missing values (`geom_point()`).
ggplot(filter(school_df, city==c('NEW YORK', 'BRONX')), aes(avg_proficiency, longitude)) + geom_point()
## Warning: Removed 13 rows containing missing values (`geom_point()`).
While it may seem unnecessary, this was graph was created to determine if there was any correlation between a school’s location.
ggplot(school_df, aes(x=percent_ell, y=avg_proficiency)) +
geom_point() + scale_x_sqrt()
## Warning: Removed 55 rows containing missing values (`geom_point()`).
ggplot(school_df, aes(x = percent_white, y = avg_proficiency)) +
geom_point() + scale_x_sqrt()
## Warning: Removed 55 rows containing missing values (`geom_point()`).
ggplot(filter(school_df, attendance_rate > 0.7), aes(x = attendance_rate, y = avg_proficiency)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
# OUTLIERS REMOVED TO FOCUS ON THIS DISTRIBUTION
ggplot(filter(school_df, rigorous_instruction > 0),
aes(x = rigorous_instruction, y = avg_proficiency)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
# FILTERING OUT OUTLIERS AT 0
ggplot(filter(school_df, fam_com_ties > 0),
aes(x = fam_com_ties, y = avg_proficiency)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
ggplot(filter(school_df, collab_teachers > 0),
aes(x = collab_teachers, y = avg_proficiency)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
ggplot(school_df, aes(x = chronically_absent, y = avg_proficiency)) +
geom_point()
## Warning: Removed 55 rows containing missing values (`geom_point()`).
ggplot(filter(school_df, leadership > 0), aes(x = leadership, y = avg_proficiency)) + geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
ggplot(filter(school_df, trust>0), aes(x = trust, y = avg_proficiency)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
ggplot(filter(school_df, supportive_env > 0),
aes(x = supportive_env, y = avg_proficiency)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
ggplot(filter(school_df, leadership > 0),
aes(x = leadership, y = avg_proficiency)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
school_final <- school_df %>%
select(-school_name, -location_code, -district, -latitude, -longitude,
-address_full, -city, -zip) %>%
filter(is.na(avg_proficiency) == FALSE)
colMeans(is.na(school_final))
## community_school economic_need_index school_income_estimate
## 0.0000000 0.0000000 0.3196385
## percent_ell percent_asian percent_black
## 0.0000000 0.0000000 0.0000000
## percent_hispanic percent_white percent_blhi
## 0.0000000 0.0000000 0.0000000
## attendance_rate chronically_absent rigorous_instruction
## 0.0000000 0.0000000 0.0000000
## collab_teachers supportive_env leadership
## 0.0000000 0.0000000 0.0000000
## fam_com_ties trust ela_proficiency
## 0.0000000 0.0000000 0.0000000
## math_proficiency avg_proficiency
## 0.0000000 0.0000000
##Linear Regression
Splitting into Training and Testing Sets
set.seed(101010)
school_split <- initial_split(school_final, prop=0.8, strata=avg_proficiency)
school_train <- training(school_split)
school_test <- testing(school_split)
Creating Recipe #1 (Community School as a Dummy Variable)
school_recipe <- recipe(avg_proficiency ~ ., data=school_train) %>%
step_dummy(all_nominal_predictors()) %>%
step_rm(c('math_proficiency', 'ela_proficiency', 'percent_asian',
'percent_black', 'percent_hispanic', 'percent_white')) %>%
step_impute_linear(school_income_estimate,
impute_with=imp_vars(
economic_need_index, starts_with('community')
)) %>%
step_interact(~ starts_with('community'):school_income_estimate +
attendance_rate:chronically_absent) %>%
step_center(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors())
lm_model <- linear_reg() %>%
set_engine('lm')
lm_wkflow <- workflow() %>%
add_model(lm_model) %>%
add_recipe(school_recipe)
lm_fit <- fit(lm_wkflow, school_train)
school_recipe
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 19
##
## Operations:
##
## Dummy variables from all_nominal_predictors()
## Variables removed c("math_proficiency", "ela_proficiency", "percent_asian...
## Linear regression imputation for school_income_estimate
## Interactions with starts_with("community"):school_income_estimate + ...
## Centering for all_numeric_predictors()
## Scaling for all_numeric_predictors()
Metrics
school_train_res <- predict(lm_fit, new_data = school_train %>%
select(-avg_proficiency))
school_test_res <- predict(lm_fit, new_data = school_test %>%
select(-avg_proficiency))
school_train_res <- bind_cols(school_train_res, school_train %>%
select(avg_proficiency))
school_train_res %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
geom_point(alpha = 0.2) +
geom_abline(lty = 2) +
theme_minimal() + coord_obs_pred()
school_test_res <- bind_cols(school_test_res, school_test %>%
select(avg_proficiency))
school_test_res %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
geom_point(alpha = 0.2) +
geom_abline(lty = 2) +
theme_minimal() + coord_obs_pred()
school_metrics <- metric_set(rmse, rsq, mae)
school_test_metrics <- metric_set(rmse, rsq, mae)
model_1_metrics <- bind_rows(
school_test_metrics(school_train_res, truth=avg_proficiency, estimate=.pred),
school_metrics(school_test_res, truth=avg_proficiency, estimate=.pred)
)
model_1_metrics
## # A tibble: 6 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.176
## 2 rsq standard 0.824
## 3 mae standard 0.130
## 4 rmse standard 0.181
## 5 rsq standard 0.776
## 6 mae standard 0.136
school_final
## # A tibble: 1,217 × 20
## community_s…¹ econo…² schoo…³ perce…⁴ perce…⁵ perce…⁶ perce…⁷ perce…⁸ perce…⁹
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Yes 0.919 31142. 0.09 0.05 0.32 0.6 0.01 0.92
## 2 No 0.641 56463. 0.05 0.1 0.2 0.63 0.06 0.83
## 3 No 0.744 44343. 0.15 0.35 0.08 0.49 0.04 0.57
## 4 No 0.86 31454 0.07 0.05 0.29 0.63 0.04 0.92
## 5 No 0.73 46436. 0.03 0.04 0.2 0.65 0.1 0.84
## 6 No 0.858 39415. 0.06 0.07 0.19 0.66 0.07 0.84
## 7 No 0.499 43707. 0.01 0.16 0.1 0.43 0.27 0.53
## 8 No 0.833 28821. 0.12 0.21 0.2 0.55 0.03 0.75
## 9 No 0.849 34889. 0.14 0.05 0.13 0.78 0.03 0.9
## 10 No 0.861 35545. 0.08 0.06 0.11 0.78 0.02 0.9
## # … with 1,207 more rows, 11 more variables: attendance_rate <dbl>,
## # chronically_absent <dbl>, rigorous_instruction <dbl>,
## # collab_teachers <dbl>, supportive_env <dbl>, leadership <dbl>,
## # fam_com_ties <dbl>, trust <dbl>, ela_proficiency <dbl>,
## # math_proficiency <dbl>, avg_proficiency <dbl>, and abbreviated variable
## # names ¹community_school, ²economic_need_index, ³school_income_estimate,
## # ⁴percent_ell, ⁵percent_asian, ⁶percent_black, ⁷percent_hispanic, …
filter(school_df, attendance_rate > 0) %>%
ggplot(aes(x = attendance_rate, y = avg_proficiency,
color = chronically_absent)) +
geom_point()
## Warning: Removed 30 rows containing missing values (`geom_point()`).
school_train %>%
ggplot(
aes(x = school_income_estimate, y = avg_proficiency,
shape = community_school)
) + geom_point(alpha=0.4)
## Warning: Removed 318 rows containing missing values (`geom_point()`).
Splitting the Data and Creating Folds
school_split <- initial_split(school_final, strata = avg_proficiency)
school_train <- training(school_split)
school_test <- testing(school_split)
school_folds <- vfold_cv(school_train, strata=avg_proficiency)
Recipe Creation (Same as Above, but using
step_normalize)
school_recipe <- recipe(avg_proficiency ~ ., data=school_train) %>%
step_dummy(all_nominal_predictors()) %>%
step_rm(c('math_proficiency', 'ela_proficiency', 'percent_asian',
'percent_black', 'percent_hispanic', 'percent_white')) %>%
step_impute_linear(school_income_estimate,
impute_with=imp_vars(
economic_need_index, starts_with('community')
)) %>%
step_interact(~ starts_with('community'):school_income_estimate +
attendance_rate:chronically_absent) %>%
step_normalize(all_numeric_predictors())
Tuning penalty and mixture.
school_spec <- linear_reg(penalty = tune(), mixture = tune()) %>%
set_mode('regression') %>%
set_engine('glmnet')
Creating the Workflow and Grid for Tuning
school_wkflow <- workflow() %>%
add_recipe(school_recipe) %>%
add_model(school_spec)
pen_mix_grid <- grid_regular(
penalty(range = c(-5, 5)), mixture(range = c(0, 1)), levels = 10
)
Fitting the Model
tune_res <- tune_grid(
school_wkflow,
resamples = school_folds,
grid = pen_mix_grid
)
## ! Fold01: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold02: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold03: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold04: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold05: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold06: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold07: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold08: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold09: internal: A correlation computation is required, but `estimate` is constant and ha...
## ! Fold10: internal: A correlation computation is required, but `estimate` is constant and ha...
Autoplot
autoplot(tune_res)
Fitting the Model to the Training Set
best_model <- select_best(tune_res, metric='rsq')
school_final_fit <- finalize_workflow(school_wkflow, best_model) %>%
fit(data = school_train)
train_results <- augment(school_final_fit, new_data = school_train)
train_results %>%
school_metrics(avg_proficiency, estimate=.pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.180
## 2 rsq standard 0.813
## 3 mae standard 0.133
Fitting the Model to the Testing Set
test_results <- augment(school_final_fit, new_data = school_test)
test_results %>%
school_metrics(avg_proficiency, estimate=.pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.168
## 2 rsq standard 0.813
## 3 mae standard 0.123
This model has a better \(R^2\) value than the prior, and thus will now be used further.
model_1_metrics
## # A tibble: 6 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.176
## 2 rsq standard 0.824
## 3 mae standard 0.130
## 4 rmse standard 0.181
## 5 rsq standard 0.776
## 6 mae standard 0.136
Predicting the Values so they can be Plotted
school_train_res <- predict(school_final_fit, new_data = school_train %>%
select(-avg_proficiency))
school_test_res <- predict(school_final_fit, new_data = school_test %>%
select(-avg_proficiency))
train_plot_results <- bind_cols(school_train_res, school_train %>%
select(avg_proficiency))
train_plot_results %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
geom_point(alpha = 0.2) +
geom_abline(lty = 2) +
theme_minimal() + coord_obs_pred() + labs(title='Training Set')
test_plot_results <- bind_cols(school_test_res, school_test %>%
select(avg_proficiency))
test_plot_results %>% ggplot(aes(x=.pred, y=avg_proficiency)) +
geom_point(alpha = 0.2) +
geom_abline(lty = 2) +
theme_minimal() + coord_obs_pred() + labs(title='Testing Set')